######################################
# Media Measurement Matters          #
# Replication Code                   #
# Appendix L: Hard News URLs         #
######################################

# The following file contains code for replicating the figures and analyses in
# Appendix L. This section focuses on the subset of URLs in our web-tracking 
# data that are likely to correspond to "hard news" stories. Information on how
# we identified these URLs is available in the script entitled 1_web_data_prep.

# Set-Up ----

# If desired, set up path into which to save plots and tables
plot_path <- NULL
table_path <- NULL

# Load packages
library(tidyverse)
library(ggridges)
library(ggpubr)
library(estimatr)
library(overlap)
library(gtools)

# Set up helper operations
`%notin%` <- Negate(`%in%`)

# Set up colors
red_mit = '#A31F34'
red_light = '#A9606C'
blue_mit = '#315485'
grey_light= '#C2C0BF'
grey_dark = '#8A8B8C'
black = '#353132'

# Source helper functions
source("helper_functions.R")

# > Read in data ----

# Read in survey data
srvy <- read_rds("data/survey_data_cleaned.rds")

# Read in web data and remove portal sites
web <- read_rds("data/web_use.rds")

# Filter out portal sites, soft news URLs
web_noport <- web %>% 
  filter(domain_recode %notin% c("aol.com", "msn.com", "google.com"))

web_hard <- web_noport %>% 
  filter(hard_news == 1)

# Most visited domains ----

# Figure L1: plot the proportion of visits to the top 25 most widely-visited domains
# in the dataset that are predicted to be "hard news" using our dictionary-based classifier.

# Calculate the proportion of hard vs. soft news URLs for each domain
# Note that homepage visits are considered soft news visits for this analysis
hard_props <- web %>% 
  drop_na(avg_align) %>% 
  group_by(domain_recode) %>% 
  summarise(hard_prop = mean(hard_news == 1, na.rm = T),
            n = n(),
            score = mean(avg_align)) %>% 
  arrange(desc(n))

# Figure L1(a): plot the top 25 most popular domains, with soft news visits
# including homepage visits for all domains
(l1_a <- ggplot(hard_props[1:25,], aes(x = domain_recode, y = hard_prop)) + 
    geom_bar(stat = "identity") + 
    theme_bw() + 
    ylab("Proportion of Hard News URLs") +
    xlab("") + 
    theme(axis.text.x = element_text(angle = 45, hjust = 1),
          plot.title = element_text(hjust = 0.5)) + 
    scale_fill_gradient(low = "blue", high = "red", name = "Align Score") +
    scale_x_discrete(limits=hard_props[1:25,]$domain_recode) +
    geom_text(aes(label = format(round(hard_prop, 2), nsmall = 2)), vjust = -0.25) + 
    scale_y_continuous(expand = c(0,0),
                       limits = c(0, 1.1)) + 
    theme(legend.position = "none",
          plot.title = element_blank(),
          plot.subtitle = element_text(hjust = 0.5, face = "italic", size = 12),
          axis.title.x = element_text(margin = unit(c(3, 0, 0, 0), "mm"),
                                      size = 12),
          axis.title.y = element_text(margin = unit(c(0, 3, 0, 0), "mm"), 
                                      size = 12),
          legend.title = element_text(hjust = 0.5),
          axis.text.x = element_text(size = 10, color = "black"),
          axis.text.y = element_text(size = 10, color = "black")) 
)
ggsave(l1_a, path = plot_path, filename = "fig_l1_a.pdf",
       height=6, width=9, dpi = 600)

# Re-run this same analysis excluding homepage visits, including visits
# to the Yahoo! News landing page
yahoo_urls <- c("https://www.yahoo.com/news/", "https://www.yahoo.com/news", 
                "http://www.yahoo.com/news/", "http://www.yahoo.com/news")

hard_props_home <- web %>% 
  drop_na(avg_align) %>% 
  group_by(domain_recode) %>% 
  summarise(hard_prop = mean(hard_news_nohome == 1, na.rm = T),
            n = n(),
            score = mean(avg_align)) %>% 
  arrange(desc(n))

# Figure L1(b): plot the top 25 most popular domains, with soft news visits
# *excluding* homepage visits for all domains
(l1_b <- ggplot(hard_props_home[1:25,], aes(x = domain_recode, y = hard_prop)) + 
    geom_bar(stat = "identity") + 
    theme_bw() + 
    ylab("Proportion of Hard News URLs") +
    xlab("") + 
    ggtitle("Hard vs. Soft News: Top 25 Domains (Matched)") + 
    theme(axis.text.x = element_text(angle = 45, hjust = 1),
          plot.title = element_text(hjust = 0.5)) + 
    scale_fill_gradient(low = "blue", high = "red", name = "Align Score") +
    scale_x_discrete(limits=hard_props_home[1:25,]$domain_recode) +
    geom_text(aes(label = format(round(hard_prop, 2), nsmall = 2)), vjust = -0.25) + 
    scale_y_continuous(expand = c(0,0),
                       limits = c(0, 1.1)) + 
    theme(legend.position = "none",
          plot.title = element_blank(),
          plot.subtitle = element_text(hjust = 0.5, face = "italic", size = 12),
          axis.title.x = element_text(margin = unit(c(3, 0, 0, 0), "mm"),
                                      size = 12),
          axis.title.y = element_text(margin = unit(c(0, 3, 0, 0), "mm"), 
                                      size = 12),
          legend.title = element_text(hjust = 0.5),
          axis.text.x = element_text(size = 10, color = "black"),
          axis.text.y = element_text(size = 10, color = "black")) 
)
ggsave(l1_b, path = plot_path, filename = "fig_l1_b.pdf",
       height=6,width=9, dpi = 600)

# Measurement validation ----

# Read in MTurk data
mt <- read_csv("data/url_mturk.csv")

# Read in dictionary-based classifier results for the sample of URLs
class <- read_csv("data/url_ratings.csv")

# > Worker information ----

# Number of URLs rated by each worker
urls_rated <- mt %>% 
  group_by(id) %>% 
  summarise(n = n()) %>% 
  arrange(desc(n))

summary(urls_rated$n) # Range: 1-267 (M = 23, mean = 45.5)
nrow(urls_rated) # 220 unique raters

# > Process MT ratings ----

mt <- mt %>% 
  mutate(rate_num = case_when(rating == "Yes" ~ 1, TRUE ~ 0), # Yes vs. No/Not sure
         rate_dk = case_when(rating == "Not Sure" ~ 1, TRUE ~ 0)) # Not sure responses

# For each URL, calculate the percent of responses classifying that URL as hard news,
# as well as the percent indicating they were unsure and the number of unique responses
ratings <- mt %>% 
  group_by(url) %>% 
  summarise(prop_news = mean(rate_num, na.rm = T),
            prop_dk = mean(rate_dk, na.rm = T),
            num_vals = n_distinct(rating))

table(ratings$num_vals) 
  # On all but 477 URLs, there was some disagreement between raters

ratings %>% 
  filter(num_vals == 1) %>% 
  select(prop_news, prop_dk) %>% 
  summary()
  # Almost all of these (94%) were cases where raters unanimously rated
  # a URL as news. None were cases where every rater said "Not sure."

sum(ratings$prop_dk >= 0.5)
  # There are a very small number of cases where a majority of raters (3/5)
  # selected "Not sure"

# > Classifier performance ----

# For each URL, classify it as hard news if >50% of raters (at least 3/5) indicated
# it was likely to correspond to a news story.
ratings <- ratings %>% 
  mutate(mt_news = case_when(prop_news >= 0.5 ~ 1, TRUE ~ 0)) %>% 
  # Merge in the scores from our dictionary-based classifier
  left_join(class %>% select(url = url_clean, class_news = hard_news), by = "url") %>% 
  mutate(match = case_when(mt_news == class_news ~ 1, mt_news != class_news ~ 0))

mean(ratings$match) # 75.2% of classifications match

# Compare predictions (dictionary-based classifier) with reference scores (MT)
news_ct <- table(ratings$class_news, ratings$mt_news)

# Calculate performance metrics
accuracy <- sum(diag(news_ct))/2000
precision <- news_ct[2,2]/sum(news_ct[2,])
recall <- news_ct[2,2]/sum(news_ct[,2])

accuracy; precision; recall

# Descriptive analysis ----

# > Visit-level analysis ----

# Figure L2: distribution of visit-level alignment scores by stated media
# preferences, excluding portals and focusing just on visits to hard news URLs
l2 <- visit_plots(var = "med_pref", var_labels = c("Prefer\nMSNBC", 
                                                   "Prefer\nEntertainment", 
                                                   "Prefer\nFox"),
                  var_levels = c("MSNBC", "Entertainment", "Fox"),
                  x_label = "Relative Slant of News Visits (Hard News Only)",
                  y_label = "Stated Media Preference", weights = FALSE, df = web_hard, 
                  exemplars = c("cnn.com","news.yahoo.com","msnbc.com",
                                "foxnews.com","nytimes.com"))

ggsave(l2, path = plot_path, filename = "fig_l2.pdf",
       height= 3.5, width = 8, dpi = 600)

# Estimate overlapping coefficient across stated preference groups
visit_overlap_medpref <- visit_overlap(df = web_hard,
                                       group_var = "med_pref", 
                                       values = c("MSNBC", "Entertainment", "Fox"),
                                       out_contrasts = c("MSNBC vs. Fox",
                                                         "MSNBC vs. Entertainment",
                                                         "Fox vs. Entertainment"))

(site_overlap_hard <- visit_overlap_medpref$`MSNBC vs. Fox`)

# > Respondent-level analysis ----

# Figure L3(a): distribution of respondent-level volume by stated media
# preferences, focusing just on visits to hard news URLs
l3_a <- score_plots(var = "med_pref", var_labels = c("Prefer\nMSNBC", 
                                                     "Prefer\nEntertainment", 
                                                     "Prefer\nFox"),
                    var_levels = c("MSNBC", "Entertainment", "Fox"), x_var = "hard_perc",
                    x_label = "Proportion of Visits to News Domains\n(Hard News Only)",
                    y_label = "Stated Media Preference", by = 0.25,
                    height = 1.85, x_limits = c(-0.1, 0.85))

ggsave(l3_a, path = plot_path, filename = "fig_l3_a.pdf",
       height= 3, width = 4.5, dpi = 600)

# Figure L3(b): distribution of respondent-level slant by stated
# media preferences, excluding portals and focusing just on visits to hard news URLs
l3_b <- score_plots(var = "med_pref", var_labels = c("Prefer\nMSNBC", 
                                                     "Prefer\nEntertainment", 
                                                     "Prefer\nFox"),
                    var_levels = c("MSNBC", "Entertainment", "Fox"), x_var = "score_hard",
                    x_label = "Respondent Avg. Slant Score\n(Hard News Only)",
                    y_label = "Stated Media Preference")

ggsave(l3_b, path = plot_path, filename = "fig_l3_b.pdf",
       height= 3, width = 4.5, dpi = 600)

# Estimate overlapping coefficient across stated preference groups
overlap_medpref <- score_overlap(var = "score_hard", group_var = "med_pref", 
                                 values = c("MSNBC", "Entertainment", "Fox"),
                                 out_contrasts = c("MSNBC vs. Fox",
                                                   "MSNBC vs. Entertainment",
                                                   "Fox vs. Entertainment"))

(resp_overlap_hard <- overlap_medpref$`MSNBC vs. Fox`)

# Experimental analysis ----

# Set theme for plotting
theme_set(theme_bw() + 
            theme(legend.position = "bottom",
                  plot.title = element_text(hjust = 0.5, face = "bold",size = 16),
                  plot.subtitle = element_text(hjust = 0.5, face = "italic", size = 12),
                  axis.title.x = element_text(margin = unit(c(3, 0, 0, 0), "mm"),
                                              face = "bold", size = 12, angle = 0, hjust = 0.5),
                  axis.title.y = element_text(margin = unit(c(0, 3, 0, 0), "mm"), 
                                              face = "bold", size = 12),
                  legend.title = element_text(face = "bold", hjust = 0.5, size = 12),
                  legend.text = element_text(hjust = 0.5, size = 10),
                  axis.text.y = element_text(size = 10, color = "black"),
                  legend.box = "vertical",
                  legend.background = element_blank(),
                  legend.box.background = element_rect(colour = "black"),
                  text=element_text(colour=black, 
                                    size=15)))

# > Group respondents into preference strata ----

# For the relative volume measure, split respondents into terciles.
srvy <- srvy %>% 
  mutate(hard_news_bin3 = ntile(hard_perc, 3))

# Compare number of respondents assigned to each relative slant bin across the
# full sample of URLs vs. sample with hard news URLs only
srvy %>% 
  filter(forcedchoice == 1) %>% 
  rename(score_code_orig = score_code) %>% 
  select(score_code_orig, score_code_hard) %>% 
  apply(MARGIN = 2, table)

srvy %>% 
  filter(forcedchoice == 1) %>% 
  rename(score_code_orig = score_code) %>% 
  select(score_code_orig, score_code_hard) %>% 
  table()

# > Relative volume results ----

# Define labels for plotting
consump_labels3 <- gen_ranges(bin_var = "hard_news_bin3", score_version = "hard_perc",
                              parentheses = TRUE)
consump_labels3 <- paste(c("Low News\nConsumption\n", "Medium News\nConsumption\n", 
                           "High News\nConsumption\n"), 
                         consump_labels3, sep = "")

# Estimate persuasion within each relative volume tercile
consump_vsent_fx3 <- vsent_plot(var = "hard_perc", nbins = 3, 
                                labels = consump_labels3, 
                                weights = FALSE)

# Figure L4: plot persuasion estimates for each subgroup
(l4 <- ggplot(na.omit(consump_vsent_fx3 %>% filter(id != "Fox vs.\nMSNBC")), 
              aes(x=factor(val),
                  col = factor(id, levels = c("Fox vs.\nEntertainment",
                                              "MSNBC vs.\nEntertainment")),
                  shape = factor(id, levels = c("Fox vs.\nEntertainment",
                                                "MSNBC vs.\nEntertainment")))) +
    geom_hline(yintercept=0, col = "white") +
    geom_hline(yintercept=0, linetype="dashed", color = grey_dark) +
    geom_errorbar(aes(ymin=min_cilo90, ymax=max_cihi90),
                  width=0, lwd = 1, position = position_dodge(width = 0.5)) +
    geom_errorbar(aes(ymin=min_cilo, ymax=max_cihi),
                  width=0, position = position_dodge(width = 0.5)) +
    geom_point(aes(y=naive),
               position = position_dodge(width = 0.5),
               size = 2) + 
    facet_wrap(~ outcome,nrow=1) +
    scale_x_discrete(labels = unique(consump_vsent_fx3$bin)) + 
    xlab("Relative Volume of News vs. Non-News (Hard News Only)") +
    ylab("Average Treatment Effect of\nPartisan Media vs. Entertainment") +
    scale_y_continuous(breaks=seq(-0.2,0.2,0.1),
                       labels=plot_labels(break_val = 0.2, by_val = 0.1)$att,
                       limits = c(-0.225, 0.225),
                       sec.axis = dup_axis(name="",
                                           breaks=seq(-0.2,0.2,0.1),
                                           labels = plot_labels(break_val = 0.2, by_val = 0.1)$share)) +
    scale_colour_manual("Comparison",values=c(red_mit, blue_mit)) +
    scale_shape_manual("Comparison",values=c(16, 17, 15)) +
    theme(axis.text.x = element_text(size = 10, angle = 0, hjust = 0.5, color = "black")))

ggsave(l4, path = plot_path, filename = "fig_l4.pdf", 
       width=9, height=5.25, dpi = 600)
